Say something about ggplot2
Data is accessible from: https://gtexportal.org/home/datasets
The actual file is: https://storage.googleapis.com/gtex_analysis_v7/single_tissue_eqtl_data/GTEx_Analysis_v7_eQTL.tar.gz
The unzipped file contain information about eQTL from 48 tissues.
Information includes:
The data from each tissue has been read in and stored in three data.frames. The code to do this is here:
dataDir <- "/Users/askol/Dropbox (Personal)/GTEx/V7/GTEx_Analysis_v7_eQTL/"
files <- dir(dataDir, pattern = "pairs")
tissues <- gsub("\\.v7.+","", files)
data <- list()
sampFile <- paste0(dataDir, "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt")
sampInfo <- read.table(file = sampFile, header=T, as.is=T, sep="\t", quote = "")
sampInfo <- sampInfo %>% mutate(ID = gsub("-.+-.+$","",SAMPID))
ID <- sapply(sampInfo$SAMPID, strsplit, split="-")
ID <- sapply(ID, function(x){paste(x[1],x[2], sep="-")})
ID <- unlist(ID)
sampInfo$ID <- ID
a <- sampInfo %>% distinct(ID, SMTSD)
tissueN <- table(sampInfo$SMTSD)
names(tissueN) <- gsub(" - " , "_" , names(tissueN))
names(tissueN) <- gsub(" " , "_" , names(tissueN))
names(tissueN) <- gsub("\\(" , "" , names(tissueN))
names(tissueN) <- gsub("\\)" , "" , names(tissueN))
names(tissueN) <- gsub("Cells_Cultured_fibroblasts",
"Cells_Transformed_fibroblasts",
names(tissueN))
tissueN <- as.data.frame(tissueN)
names(tissueN) <- c("tissue", "N")
saveRDS(file = "Data/sampleSize.rds", tissueN)
for (i in 1:length(files)){
file <- paste0(dataDir, files[i])
print(paste0("Reading file", file))
data[[i]] <- fread(file, header=T)
}
## collect p-values
ps <- data[[1]][,c("variant_id","symbol", "pval_nominal")] %>%
mutate(chr = gsub("_.+","",variant_id)) %>% filter(chr %in% c(1,6,20))
ps <- ps %>% rename(!!tissues[1] := pval_nominal)
for (i in 2:length(data)){
print(paste0("Mergeing ",tissues[i]))
tmp <- data[[i]][, c("variant_id","symbol","pval_nominal")] %>%
mutate(chr = gsub("_.+","",variant_id)) %>% filter(chr %in% c(1,6,20))
tmp <- tmp %>% rename(!!tissues[i] := pval_nominal)
ps <- full_join(ps, tmp, by=c("variant_id" , "symbol", "chr"))
}
## collect betas
betas <- data[[1]][,c("variant_id","symbol", "slope")] %>%
mutate(chr = gsub("_.+","",variant_id)) %>% filter(chr %in% c(1,6,20))
betas <- betas %>% rename(!!tissues[1] := slope)
for (i in 2:length(data)){
print(paste0("Mergeing ",tissues[i]))
tmp <- data[[i]][, c("variant_id","symbol","slope")] %>%
mutate(chr = gsub("_.+","",variant_id)) %>% filter(chr %in% c(1,6,20))
tmp <- tmp %>% rename(!!tissues[i] := slope)
betas <- full_join(betas, tmp, by=c("variant_id" , "symbol", "chr"))
}
## save ps
psFile <- paste0(dataDir, "ps.rds")
saveRDS(file = psFile, ps)
## save betas
betasFile = paste0(dataDir,"betas.rds")
saveRDS(file = betasFile, betas)
Let’s load the ps and betas variables
dataDir <- "/Users/askol/Dropbox (Personal)/GTEx/V7/GTEx_Analysis_v7_eQTL/"
betasFile = "Data/betas.rds"
psFile <- "Data/ps.rds"
nFile <- "Data/sampleSize.rds"
betas = readRDS(file = betasFile)
ps = readRDS(file = psFile)
tissueN = readRDS(file = nFile)
## substitute gene names for ensembl ids in ps and betas
betas <- betas %>% mutate(gene_id = gsub("\\..+","", gene_id))
betas <- left_join(betas, grch38[,c("ensgene","symbol")], by = c("gene_id" = "ensgene")) %>% mutate(symbol = ifelse(is.na(symbol), gene_id, symbol))
ps <- ps %>% mutate(gene_id = gsub("\\..+", "", gene_id))
ps <- left_join(ps, grch38[,c("ensgene","symbol")], by=c("gene_id" = "ensgene")) %>%
mutate(symbol = ifelse(is.na(symbol), gene_id, symbol))
Now that we have some data to play with, we can start exploring how to perform some simple plotting functions using ggplot2.
First we’ll plot the number of eQTL as a function of tissue using a historgram. Since a non-NA cell indicates that it is an eQTL, all we have to do is sum the number of non-NA cells in a column (tissue) to count the number of eQTL.
neqtl <- ps %>% select(-chr, -variant_id, -gene_id)
neqtl <- colSums(!is.na(neqtl))
neqtl <- data.frame(tissue = names(neqtl), neqtl = neqtl)
Plots created with ggplot start with defining the dataset and the variables that will make up the x and y axis. This is done using the ggplot function, defining data, and the aesthetic variables (aes).
options(width=100)
p <- ggplot(data=neqtl, aes(x=tissue, y=neqtl))
plot(p)
Next we add the geom_ ‘function’ that defines the type of plot. Here we use geom_bar, because we want to make a barplot. These functions take arguments that are specific to the plot type and graphical elements specific to the plot.
options(width=100)
p <- p + geom_bar(stat="identity", fill = "steelblue")
print(p)
stat=“identity” means that the value in the data.frame is the value to be plotted. The default is stat=“count”, in which case it will count the number of elements in the data.frame column.
There are a few things that we can do to make this plot look nicer.
options(width=100)
p <- ggplot(data=neqtl, aes(x=tissue, y=neqtl)) +
geom_bar(stat="identity", fill = "steelblue") +
theme_minimal() +
theme(text=element_text(size=10),
axis.text.x=element_text(angle=60, hjust=1)) +
xlab("Tissue") +
ylab("Number of eQTL")
print(p)
One nice thing about ggplot is that the plot can be added to over multiple commands. This allows conditional statement to help determine what will be contained in the plot.
One challenge of plotting categorical variables is deciding how to order them on the x-axis in a meaningful or attractive manner. Here, because different tissues have different numbers of samples, it may be reasonable to organize the tissues by samples size. First, we’ll merge the eqtl number data with the sample size data.
neqtl <- left_join(neqtl, tissueN, by="tissue")
## Warning: Column `tissue` joining factors with different levels, coercing to character vector
neqtl$tissue <- factor(neqtl$tissue,
levels = neqtl$tissue[order(neqtl$N, decreasing = T)])
Next we repeat the plot above.
options(width=100)
p <- ggplot(neqtl, aes(x=tissue, y=neqtl)) +
geom_bar(stat="identity", fill = "steelblue") +
theme_minimal() +
theme(text=element_text(size=10),
axis.text.x=element_text(angle=60, hjust=1)) +
xlab("Tissue") +
ylab("Number of eQTL")
print(p)
O.k. That’s much better. But if we really want to know the relationship between sample size and the number of eQTL, then let’s plot the number of samples of each tissue type on the same plot. To do this we have to do two things. First is to plot the sample size. We’ll use geom_points to do this. The other thing, is that scale of the number of eQTL and number of samples is quite different.
range(tissueN$N)
## [1] 4 3288
range(neqtl$neqtl)
## [1] 27146 1733030
factorN <- 494913/3288
options(width=100)
p <- p + geom_point(aes(x=tissue, y=N*factorN)) +
scale_y_continuous(sec.axis = sec_axis(~./factorN, name = "Sample Size"))
print(p)
## Warning: Removed 1 rows containing missing values (geom_point).
Now we’ll consider the effect sizes of eQTL in order to see how to plot distributions (the relative frequence of the variable’s values). Let’s first use the histogram to look at the distribution of eQTL effects. To make things simpler, and to demonstrate how to compare distributions, we will focus on two tissues: “Skeletal Muscle and Subcutaneous Adipose”
TissSel <- c("Muscle_Skeletal", "Adipose_Subcutaneous")
bs <- betas %>% select(!!TissSel)
bs <- bs[rowSums(!is.na(bs)) > 0,]
bs <- as.data.table(bs)
bs <- melt(bs, measure=1:2, variable.name="tissue", value.name = "beta", na.rm=TRUE)
The function melt is very important for preparing data for ggplot. It transforms a data.frame from a wide rectangle to a long rectangle, whereby a subset of the column headers are converted to a column of values (specificied by variable.name), and the values in the converted columns are converted to another column (specificed by value.name). If there are columns that are not to be “longified” they are specified by id.vars.
betas[1:4,TissSel]
## Muscle_Skeletal Adipose_Subcutaneous
## 1 NA 0.914962
## 2 NA 0.705560
## 3 NA 0.271740
## 4 NA -0.804681
This is what the transformation looks like for the data that were going to be plotting:
bs[1:4,]
## tissue beta
## 1: Muscle_Skeletal -0.687663
## 2: Muscle_Skeletal -1.175840
## 3: Muscle_Skeletal -1.235690
## 4: Muscle_Skeletal -1.269570
A histrogram of the distribution of all the values is created using geom_hist
options(width=100)
p <- ggplot(data=bs, aes(x = abs(beta))) +
geom_histogram(binwidth=.1, colour="black", fill="white")
print(p)
If you prefer density functions, you can use
geom_density
p <- ggplot(bs, aes(x = abs(beta))) + geom_density(adjust = 1.15)
print(p)
What we really wanted to do was compare the distribution of effect sizes for different tissue types. To do this, we will use the fill attribute of the density function that is avalable within the ggplot function (or the geom_density function)
options(width=100)
p <- ggplot(bs, aes(x=beta, fill=tissue)) + geom_density(alpha=.3) +
scale_fill_discrete(name = "Tissue", labels=c("Muscle (skeletal)", "Adipose (subcutaneous)")) +
theme_minimal() + theme(legend.title.align=0.5)
print(p)
## Boxplots and violin plots What if we wanted to compare more than two tissues. It’s not hard to imagine that looking at more than 2 or 3 tissues via overlapping densities will start to get ugly and uninformative very quickly. An alternative is to use a box and wisker plot. First we will select a subset of tssues and create an appropriately formated
data.frame. We will also summarize the eQTL effects by gene, by calculating the mean eQTL effect for each gene.
tissSel <- c("symbol", "Adipose_Subcutaneous",
"Brain_Frontal_Cortex_BA9",
"Breast_Mammary_Tissue","Colon_Transverse", "Esophagus_Mucosa",
"Heart_Left_Ventricle","Liver", "Lung", "Muscle_Skeletal",
"Spleen", "Skin_Sun_Exposed_Lower_leg","Stomach")
tissueNabrv <- tissueN[tissueN$tissue %in% tissSel,]
tissueNabrv$tissue <- droplevels(tissueNabrv$tissue)
tissueNabrv$tissue <- factor(tissueNabrv$tissue, levels =
tissueNabrv$tissue[order(tissueNabrv$N,
decreasing=TRUE)])
bs <- betas %>% select(chr, !!tissSel) %>% as.data.table()
bs <- melt(bs, measure=3:length(tissSel), id = 1:2,
variable.name="tissue", value.name="beta")
bs <- bs %>% group_by(symbol, tissue) %>%
mutate(m.beta=mean(beta, na.rm=T)) %>% filter(row_number() == 1) %>%
select(-beta) %>% ungroup()
bs <- bs %>% filter(!is.na(m.beta)) %>%
mutate(tissue = factor(tissue, levels = levels(tissueNabrv$tissue)))
Now we will plot these gene-based mean eQTL effect sizes across tissues.
options(width=120)
p <- ggplot(bs, aes(x = tissue, y = m.beta)) +
geom_boxplot() + theme_minimal() +
theme(text=element_text(size=10),
axis.text.x=element_text(angle=60, hjust=1)) +
xlab("Tissue") +
ylab("Mean Beta")
print(p)
Now, what if we wanted to know if the effect size distribution differs by chromosomal location. To do this, we can use another aesthitic attribute. In this case we’ll use the fill attribute.
bs$chr <- factor(bs$chr, levels = c("1","6","20"))
options(width=120)
p <- ggplot(bs, aes(x = tissue, y = m.beta, fill = chr)) +
geom_boxplot() + theme_minimal() +
theme(text=element_text(size=10),
axis.text.x=element_text(angle=60, hjust=1)) +
xlab("Tissue") +
ylab("Mean Beta") +
scale_fill_discrete(name = "Chromosome")
print(p)
## The violin plot The barplot gives us valuable information about the distribution, but is not as informative as the full distribution. The violin plot is a way to better visualize the full distribution. Not surprisingly, the plot function is
geom_violin.
p <- ggplot(bs, aes(x = tissue, y = m.beta)) +
geom_violin() + theme_minimal() +
theme(text=element_text(size=10),
axis.text.x=element_text(angle=60, hjust=1)) +
xlab("Tissue") +
ylab("Mean Beta")
print(p)
It is not uncommon to plot the points on top of the violins. Our data happens to have a very large number of points. In order to “thin” out the data, we can use the
alpha parameter in the geom_jitter function.
p <- p + geom_jitter(width = 0.1, alpha=.01)
print(p)
ggplot makes adding labels to plots pretty easy. As an example, let’s label the genes that have the larges and smallest value in each tissue on each chromosome. First, it’s useful to identify the minimum and maximum value for each tissue/chromosome.
bs <- bs %>% group_by(tissue, chr) %>%
mutate(min.lab = ifelse(m.beta == min(m.beta), gsub("ENSG0+","", symbol), ""),
max.lab = ifelse(m.beta == max(m.beta), gsub("ENSG0+","", symbol), ""),
lab = ifelse(m.beta %in% c(max(m.beta), min(m.beta)),
gsub("ENSG0+","", symbol), ""))
Now we use geom_text.
options(width=120)
p <- ggplot(bs, aes(x = tissue, y = m.beta)) +
geom_boxplot() +
geom_text(aes(label = max.lab), size = 2, angle=0, vjust=-1) +
geom_text(aes(label = min.lab), size = 2, angle=0, vjust=2) +
theme_minimal() +
theme(text=element_text(size=10),
axis.text.x=element_text(angle=60, hjust=1)) +
xlab("Tissue") +
ylab("Mean Beta")
print(p)
When points are this close together, it’s really trick to get the labels to look nice. There is a fantastic package called repel which can help. Here’s how you use it.
options(width=120)
p <- ggplot(bs, aes(x = tissue, y = m.beta)) +
geom_boxplot() +
geom_text_repel(data = bs, aes(label = max.lab), size = 2, box.padding = .1,
nudge_x=-.5, angle=0) +
geom_text_repel(data = bs, aes(label = min.lab), size = 2, box.padding = .1,
nudge_x=-.5, angle=0) +
theme_minimal() +
theme(text=element_text(size=10),
axis.text.x=element_text(angle=60, hjust=1)) +
xlab("Tissue") +
ylab("Mean Beta")
print(p)
Recall our boxplot that printed the distribution of betas for chromosomes 1, 6 and 20 next to each other. As you can image, if we labelled gene names on that plot it would be a mess. For this reason, and others, sometimes it is better to repeat a simpler plot multiple times as a function of an addition variable (e.g. chromosome) in an organized way. One way to do this is to use facet_grid. First, in order to make the plot look a little nicer we will add “chr” to the front of the chromosome number.
bs$chr <- factor(paste0("chr", bs$chr), levels = c("chr1","chr6","chr20"))
options(width=120)
p <- ggplot(bs, aes(x = tissue, y = m.beta)) +
geom_boxplot() +
facet_grid(chr~.) +
theme_minimal() +
theme(text=element_text(size=10),
axis.text.x=element_text(angle=60, hjust=1)) +
xlab("Tissue") +
ylab("Mean Beta")
print(p)
If you prefer have chromsome along the verticle axis, just change the chr~. to .~chr
options(width=120)
p <- ggplot(bs, aes(x = tissue, y = m.beta)) +
geom_boxplot() +
facet_grid(.~chr) +
theme_minimal() +
theme(text=element_text(size=10),
axis.text.x=element_text(angle=60, hjust=1)) +
xlab("Tissue") +
ylab("Mean Beta")
print(p)
## Plotting similarity Next we are going to explore heatmaps, which are useful for comparing the pairwise correlation or other similarity measures between variables. For our data, we will calculate the proportion of eQTL shared between tissues. We can do this using the
ps data frame which has tissues as columns, variant-gene combinations as rows, and each cell contains a p-value is the variant-gene is an eQTL for the tissue and NA otherwise. We will turn these into counts in the following way:
tissSel <- c("Adipose_Subcutaneous",
"Brain_Frontal_Cortex_BA9",
"Breast_Mammary_Tissue","Colon_Transverse", "Esophagus_Mucosa",
"Heart_Left_Ventricle","Liver", "Lung", "Muscle_Skeletal",
"Spleen", "Skin_Sun_Exposed_Lower_leg","Stomach")
tissim <- ps[, tissSel]
tissim <- tissim[rowSums(!is.na(tissim)) > 0 , ]
tissim <- 1*(is.na(tissim) == FALSE)
tissim <- Matrix(tissim, sparse=T)
tisNshare <- t(tissim) %*% tissim
tisNeither<- nrow(tissim) - t(tissim == 0) %*% (tissim == 0)
tisShareProp <- tisNshare/tisNeither
tisShareProp <- as.matrix(tisShareProp)
tisShareProp <- as.data.table(tisShareProp, stringAsFactor=FALSE)
tisShareProp$Tissue1 <- names(tisShareProp)
toplot <- melt(tisShareProp, id = "Tissue1", variable.name="Tissue2", value.name = "PropShare")
toplot <- toplot %>% mutate(Tissue1 = factor(Tissue1, levels=names(tisShareProp)),
Tissue2 = factor(Tissue2, levels=names(tisShareProp)))
toplot <- toplot %>% filter(Tissue1 != Tissue2)
O.k. Now that we’ve calculate the proportion of eQTL that tissue pairs share in common, we can plot them in a number of ways. One simple way is to use a scatter plot.
p <- ggplot(toplot, aes(x=Tissue1, y=Tissue2)) +
geom_point(aes(size = PropShare, color = PropShare)) +
scale_size_continuous(name = "Prop eQTL Shared",
range=c(.5,14),
limits=c(.1,.45)) +
theme(axis.text.x = element_text(angle = 45),
text=element_text(size=10, hjust=1),
axis.text.y = element_text(angle = 45),
axis.title = element_text(hjust=.5)) +
xlab("Tissue") + ylab("Tissue")
## COLOR LABELS TO HELP IDENTIFY WHO IS WHO ##
#theme(axis.text.x = element_text(angle = 90, color = STclr)) +
#scale_color_manual(values=clrLvl)
print(p)
A more common alternative for plotting similarity data is a heatmap. The way to make a heatmap using ggplot is using geom_tile, although many folks prefer alternative heatmapping functions. We’ll explore one of them shortly. Geom_tile works very much like